Introduction

This is a draft; the analysis is still ongoing.

This document focuses on exploring the relationship between the census variables.

Setup

library(tidyverse)
library(magrittr)
library(knitr)
library(GGally)

Data

Load the transformed census data.

# Read the transformed census table. The column specification is a compact
# readr string: the first 12 columns are parsed as character ("c") and the
# following 125 columns as double ("d").
census_col_types <- paste0(strrep("c", 12), strrep("d", 125))
census_data_trans <-
  "../storage/dati-cpa_2011_all-trans-v0_0_4.csv" %>%
  read_csv(col_types = census_col_types)

Correlations

Calculate the correlation between the transformed variables to identify those that might be excluded from the analysis.

# Names of every column in the contiguous range from P1_norm_log10_std to
# E30_E31_norm_std; these are the variables compared pairwise below.
candidate_vars <-
  colnames(
    select(census_data_trans, P1_norm_log10_std:E30_E31_norm_std)
  )

# Kendall rank correlation for every unordered pair of candidate variables.
#
# Result: `candidate_vars_cor`, a tibble with one row per pair and columns
#   var_i, var_j - names of the two variables compared
#   estimate     - Kendall correlation estimate returned by cor.test()
#   p_value      - p-value returned by cor.test()
#
# Each pair is tested on its own fresh 1% random sample of the rows, drawn in
# the same pair order as a nested i < j loop, so a given RNG state yields the
# same sequence of samples.
# NOTE(review): no set.seed() is visible in this section, so the estimates
# will presumably differ between runs -- confirm whether that is intended.

# All (i, j) pairs with i < j, ordered (1,2), (1,3), ..., (2,3), ...
# combn() errors when fewer than two variables are available, so fall back
# to an empty list in that case.
candidate_var_pairs <-
  if (length(candidate_vars) >= 2) {
    combn(candidate_vars, 2, simplify = FALSE)
  } else {
    list()
  }

# Build one single-row tibble per pair and bind them once at the end, rather
# than growing the result with add_row() on every iteration.
candidate_vars_cor_rows <-
  map(
    candidate_var_pairs,
    function(var_pair) {
      census_data_trans_sample <-
        census_data_trans %>%
        slice_sample(prop = 0.01)
      ij_cor_test <- cor.test(
        census_data_trans_sample[[var_pair[1]]],
        census_data_trans_sample[[var_pair[2]]],
        method = "kendall"
      )
      tibble(
        var_i = var_pair[1],
        var_j = var_pair[2],
        # as.numeric() drops the name attached to the estimate
        estimate = as.numeric(ij_cor_test$estimate),
        p_value = as.numeric(ij_cor_test$p.value)
      )
    }
  )

# The zero-row template keeps the four typed columns present even when there
# are no pairs, so the filters further down still work.
candidate_vars_cor <-
  bind_rows(
    tibble(
      var_i = character(),
      var_j = character(),
      estimate = numeric(),
      p_value = numeric()
    ),
    candidate_vars_cor_rows
  )

Further explore the most highly correlated variables, including all correlations with a coefficient above \(0.5\) (indicating a shared variability above \(25\%\), in orange in the annotated chart below), and focusing in particular on correlations with a coefficient above \(0.7\) (indicating a shared variability above \(50\%\), in red in the annotated chart below).

# Thresholds for a correlation to be reported: the p-value must be below
# 0.01 and the estimate above 0.5.
correlations_cutoff_p_value <- 0.01
correlations_cutoff_estimate <- 0.5

# Table of the pairs that pass both thresholds.
kable(
  filter(
    candidate_vars_cor,
    p_value < correlations_cutoff_p_value,
    estimate > correlations_cutoff_estimate
  )
)
var_i var_j estimate p_value
P1_norm_log10_std A44_norm_log10_std 0.9138809 0
P1_norm_log10_std PF1_norm_log10_std 0.9406808 0
P1_norm_log10_std E1_norm_log10_std 0.5447581 0
P7_norm_log10_std P29_norm_log10_std 0.5189718 0
P9_norm_std P53_norm_std 0.5028844 0
P17_norm_log10_std P131_norm_log10_std 0.5795901 0
P29_norm_log10_std P139_norm_std 0.5779303 0
P33_norm_std P132_norm_std 0.5205444 0
P64_norm_std P65_norm_std 0.8209654 0
ST1_norm_log10_std ST2_norm_std 0.5786968 0
ST1_norm_log10_std ST3_norm_std 0.5546127 0
ST2_norm_std ST3_norm_std 0.5661384 0
A3_norm_std A5_A6_A7_norm_std 0.9762379 0
A44_norm_log10_std PF1_norm_log10_std 0.9270664 0
A44_norm_log10_std E1_norm_log10_std 0.5687334 0
A44_norm_log10_std E20_norm_std 0.5047472 0
PF1_norm_log10_std E1_norm_log10_std 0.5562196 0
PF1_norm_log10_std E24_E25_E26_norm_std 0.5253912 0
PF2_norm_std PF6_PF7_PF8_norm_std 0.7295336 0
E20_norm_std E24_E25_E26_norm_std 0.6188560 0
E20_norm_std E27_norm_log10_std 0.5620777 0
E24_E25_E26_norm_std E27_norm_log10_std 0.7589257 0
# Distinct variables that appear on either side of a pair passing both
# cutoffs. union(x, y) is unique(c(x, y)), so the order is first appearance
# among the var_i values, followed by any new names from var_j.
correlations_to_explore <-
  candidate_vars_cor %>%
  filter(
    p_value < correlations_cutoff_p_value &
      estimate > correlations_cutoff_estimate
  ) %$%
  union(var_i, var_j)

# Pairs plot of those variables on a fresh 1% random sample of the rows
# (drawn independently of the samples used for the correlation tests):
# Kendall correlations in the upper panels, small semi-transparent points in
# the lower panels.
correlations_to_explore_panel <-
  census_data_trans %>%
  slice_sample(prop = 0.01) %>%
  # all_of() is the tidyselect way to select by an external character vector
  select(all_of(correlations_to_explore)) %>%
  ggpairs(
    upper = list(continuous = wrap(ggally_cor, method = "kendall")),
    lower = list(continuous = wrap("points", alpha = 0.3, size = 0.1))
  )
print(correlations_to_explore_panel)

# ggsave(
#   "../100-prep/111-classification-variable-selection-top-correlations-v0_0_4.png",
#   correlations_to_explore_panel,
#   width = 900,
#   height = 900,
#   units = "mm",
#   dpi=300
# )

The figure below is an annotated version of the plot above.

Variable code Variable description Normalisation code Normalisation description
P1 Popolazione residente - totale Area (Km2)
P64 Popolazione residente - maschi di 15 anni e più appartenente alle forze di lavoro P60 Popolazione residente - totale di 15 anni e più appartenente alle forze di lavoro totale
P65 Popolazione residente - maschi di 15 anni e più occupata (FL) P61 Popolazione residente - totale di 15 anni e più occupata (FL)
A3 Abitazioni vuote e abitazioni occupate solo da persone non residenti Abitazioni (all) ?
A5 Altri tipi di alloggio occupati Abitazioni (all) ?
A6 Abitazioni vuote Abitazioni (all) ?
A7 Abitazioni occupate solo da persone non residenti Abitazioni (all) ?
A44 Superficie delle abitazioni occupate da almeno una persona residente Area (Km2) ?
PF1 Famiglie residenti - totale Area (Km2) ?
PF2 Famiglie residenti - totale componenti PF1 Famiglie residenti - totale
PF6 Famiglie residenti - 4 componenti PF1 Famiglie residenti - totale
PF7 Famiglie residenti - 5 componenti PF1 Famiglie residenti - totale
PF8 Famiglie residenti - 6 e oltre componenti PF1 Famiglie residenti - totale
E24 Edifici ad uso residenziale da 5 a 8 interni E3 Edifici ad uso residenziale
E25 Edifici ad uso residenziale da 9 a 15 interni E3 Edifici ad uso residenziale
E26 Edifici ad uso residenziale con 16 interni o più E3 Edifici ad uso residenziale
E27 Totale interni in edifici ad uso residenziale E3 Edifici ad uso residenziale

Variable selection

Based on the correlations illustrated above:

Save values

# Drop six transformed variables. Each one is a member of a pair whose
# estimate exceeds 0.7 in the correlation table above; the other member of
# the pair is kept.
census_data_trans_selected <-
  select(
    census_data_trans,
    -c(
      A44_norm_log10_std,
      PF1_norm_log10_std,
      P64_norm_std,
      A5_A6_A7_norm_std,
      PF2_norm_std,
      E27_norm_log10_std
    )
  )

# List the columns that remain.
names(census_data_trans_selected)
##   [1] "CODREG"                            "REGIONE"                          
##   [3] "CODPRO"                            "PROVINCIA"                        
##   [5] "CODCOM"                            "COMUNE"                           
##   [7] "PROCOM"                            "SEZ2011"                          
##   [9] "NSEZ"                              "ACE"                              
##  [11] "CODLOC"                            "CODASC"                           
##  [13] "P1_norm_log10_std"                 "P3_norm_log10_std"                
##  [15] "P4_norm_std"                       "P5_norm_std"                      
##  [17] "P6_norm_log10_std"                 "P7_norm_log10_std"                
##  [19] "P8_norm_log10_std"                 "P9_norm_std"                      
##  [21] "P10_norm_std"                      "P11_norm_std"                     
##  [23] "P12_norm_std"                      "P13_norm_std"                     
##  [25] "P14_norm_log10_std"                "P15_norm_log10_std"               
##  [27] "P16_norm_log10_std"                "P17_norm_log10_std"               
##  [29] "P18_norm_log10_std"                "P19_norm_log10_std"               
##  [31] "P20_norm_log10_std"                "P21_norm_log10_std"               
##  [33] "P22_norm_log10_std"                "P23_norm_log10_std"               
##  [35] "P24_norm_log10_std"                "P25_norm_log10_std"               
##  [37] "P26_norm_log10_std"                "P27_norm_log10_std"               
##  [39] "P28_norm_log10_std"                "P29_norm_log10_std"               
##  [41] "P30_norm_std"                      "P31_norm_std"                     
##  [43] "P32_norm_std"                      "P33_norm_std"                     
##  [45] "P34_norm_std"                      "P35_norm_std"                     
##  [47] "P36_norm_std"                      "P37_norm_std"                     
##  [49] "P38_norm_std"                      "P39_norm_std"                     
##  [51] "P40_norm_std"                      "P41_norm_std"                     
##  [53] "P42_norm_std"                      "P43_norm_std"                     
##  [55] "P44_norm_std"                      "P45_norm_std"                     
##  [57] "P46_norm_log10_std"                "P47_norm_log10_std"               
##  [59] "P48_norm_std"                      "P49_norm_std"                     
##  [61] "P50_norm_std"                      "P51_norm_log10_std"               
##  [63] "P52_norm_log10_std"                "P53_norm_std"                     
##  [65] "P54_norm_std"                      "P55_norm_std"                     
##  [67] "P56_norm_std"                      "P57_norm_std"                     
##  [69] "P58_norm_std"                      "P59_norm_std"                     
##  [71] "P60_norm_std"                      "P61_norm_log10_std"               
##  [73] "P62_norm_log10_std"                "P65_norm_std"                     
##  [75] "P66_norm_std"                      "P128_norm_log10_std"              
##  [77] "P129_norm_std"                     "P130_norm_log10_std"              
##  [79] "P131_norm_log10_std"               "P132_norm_std"                    
##  [81] "P135_norm_log10_std"               "P136_norm_std"                    
##  [83] "P137_norm_std"                     "P138_norm_std"                    
##  [85] "P139_norm_std"                     "P140_norm_std"                    
##  [87] "ST1_norm_log10_std"                "ST2_norm_std"                     
##  [89] "ST3_norm_std"                      "ST4_norm_std"                     
##  [91] "ST5_norm_log10_std"                "ST6_norm_std"                     
##  [93] "ST7_norm_std"                      "ST8_norm_std"                     
##  [95] "ST9_norm_std"                      "ST10_ST11_ST12_ST13_ST14_norm_std"
##  [97] "A2_norm_std"                       "A3_norm_std"                      
##  [99] "A46_norm_log10_std"                "A47_norm_std"                     
## [101] "A48_norm_log10_std"                "PF3_norm_std"                     
## [103] "PF4_PF5_norm_std"                  "PF6_PF7_PF8_norm_std"             
## [105] "E1_norm_log10_std"                 "E3_norm_std"                      
## [107] "E4_norm_log10_std"                 "E5_norm_std"                      
## [109] "E6_norm_std"                       "E7_norm_log10_std"                
## [111] "E8_norm_std"                       "E9_norm_log10_std"                
## [113] "E10_norm_log10_std"                "E11_norm_std"                     
## [115] "E12_norm_log10_std"                "E13_norm_log10_std"               
## [117] "E14_E15_E16_norm_log10_std"        "E17_norm_log10_std"               
## [119] "E18_norm_std"                      "E19_norm_std"                     
## [121] "E20_norm_std"                      "E21_norm_std"                     
## [123] "E22_norm_std"                      "E23_norm_log10_std"               
## [125] "E24_E25_E26_norm_std"              "E28_norm_std"                     
## [127] "E29_norm_std"                      "E30_E31_norm_std"
# Write the reduced table to the storage folder as CSV.
write_csv(
  census_data_trans_selected,
  "../storage/dati-cpa_2011_all-trans-selected-v0_0_4.csv"
)

Conclusions

This is a draft; the analysis is still ongoing.

The notes in the Variable selection section above need to be revised and the transformation process updated accordingly.

Acknowledgements

This analysis uses data from ISTAT distributed under CC BY 3.0 IT (see also legal notice).